import pandas as pd
import plotly.express as px
# --- Configuration ---------------------------------------------------------
# Path to the input CSV, relative to the directory the script is run from.
data_file = '../datasets/population_area.csv'
# Comma-separated country names to plot; compared verbatim against 'Country'.
countries_input = 'United States of America,Canada,Japan,Germany,India,Indonesia,Brazil,South Africa,Niger'
# Indicator names to plot; compared verbatim against the 'Indicator' column.
indicators_inputs = ['Population aged 0 to 14 years old (percentage)','Sex ratio (males per 100 females)']

# Longest country list (in characters) shown in the plot title before it is
# truncated with an ellipsis.
_MAX_TITLE_CHARS = 60


def parse_names(raw: str) -> list:
    """Split a comma-separated string into stripped, non-empty names."""
    return [name.strip() for name in raw.split(',') if name.strip()]


def filter_rows(df: pd.DataFrame, countries: list, indicators: list) -> pd.DataFrame:
    """Return a copy of the rows whose country AND indicator were requested.

    Expects ``df`` to have 'Country' and 'Indicator' columns.
    """
    wanted = df['Country'].isin(countries) & df['Indicator'].isin(indicators)
    # .copy() so later modifications never touch (or warn about) the source.
    return df[wanted].copy()


def clean_values(df_filtered: pd.DataFrame) -> pd.DataFrame:
    """Return rows with a numeric 'Value', sorted by country and year.

    Thousands separators (",") are removed before conversion; rows whose
    value still cannot be parsed as a number are dropped.  The input frame
    is not modified.

    Expects 'Country', 'year' and 'Value' columns (note the lowercase
    'year', as used by the rest of this script).
    """
    values = df_filtered['Value']
    # The .str accessor only works on text. If pandas already parsed the
    # column as numbers there is nothing to strip, and for mixed columns
    # astype(str) keeps genuine numbers instead of turning them into NaN.
    if not pd.api.types.is_numeric_dtype(values):
        values = values.astype(str).str.replace(',', '', regex=False)
    cleaned = df_filtered.assign(Value=pd.to_numeric(values, errors='coerce'))
    cleaned = cleaned.dropna(subset=['Value'])
    # Sorted so each country's line is drawn in chronological order.
    return cleaned.sort_values(['Country', 'year'])


def build_title(countries: list) -> str:
    """Return the plot title, truncating an overly long country list."""
    joined = ', '.join(countries)
    if len(joined) > _MAX_TITLE_CHARS:
        joined = joined[:_MAX_TITLE_CHARS - 3] + "..."
    return f'Indicators for {joined} over Time'


def main() -> None:
    """Load the dataset, filter it to the configured selection and plot it.

    Prints a diagnostic and returns without plotting when no countries are
    configured, when nothing in the file matches the selection, or when no
    numeric values remain after cleaning.
    """
    # 1. Parse the configured selection.
    selected_countries = parse_names(countries_input)
    if not selected_countries:
        print("No country names entered. Exiting.")
        return

    # 2. Load the data; strip stray whitespace from the header names.
    df = pd.read_csv(data_file)
    df.columns = df.columns.str.strip()

    # 3. Keep only the requested countries and indicators.
    df_filtered = filter_rows(df, selected_countries, indicators_inputs)

    # 4. Report missing / partial matches.
    if df_filtered.empty:
        print(f"No data found for any of the specified countries: {', '.join(selected_countries)}")
        return
    found_countries = list(df_filtered['Country'].unique())
    # Sorted so the warning lists countries in a stable order.
    not_found = sorted(set(selected_countries) - set(found_countries))
    if not_found:
        print(f"\nWarning: No data found for the following requested countries: {', '.join(not_found)}")
        print(f"Plotting data for: {', '.join(found_countries)}")
    else:
        print(f"\nFound data for: {', '.join(found_countries)}")

    # 5. Convert values to numbers and drop anything unparseable.
    df_filtered = clean_values(df_filtered)
    if df_filtered.empty:
        print("No valid numeric indicator data found for the selected countries after cleaning.")
        return

    # 6. Plot: colour distinguishes countries; dash style and marker shape
    #    distinguish indicators.
    print("\nGenerating plot for selected countries...")
    fig = px.line(
        df_filtered,
        x='year',
        y='Value',
        color='Country',
        line_dash='Indicator',
        symbol='Indicator',
        markers=True,
        title=build_title(found_countries),
        hover_data=['Indicator'],  # show the indicator name on hover
    )
    fig.update_layout(
        xaxis_title='Year',
        yaxis_title='Indicator Value',
        legend_title='Legend',
    )

    # 7. Show the plot.
    fig.show()


if __name__ == "__main__":
    main()